## Setup: Install the required packages and load the libraries. If you have not installed the tidyverse and lubridate packages yet, uncomment the two install.packages() lines below.
#install.packages("tidyverse")
#install.packages("lubridate")
library(tidyverse)
library(lubridate)
# Load the stop records from the course website and build a small lookup
# table holding the number of residents for each subject_race category.
wichita <- read_csv(
  "https://datajournalism.tech/wp-content/uploads/2019/10/wichita.csv"
)
population <- tribble(
  ~subject_race,            ~num_people,
  "asian/pacific islander", 19262,
  "black",                  42679,
  "hispanic",               63659,
  "other/unknown",          13451,
  "white",                  246343
)
Explore the dataset provided by Stanford University. See more on their website https://openpolicing.stanford.edu.
View(wichita) # open the full data table in a spreadsheet-style viewer
str(wichita) # print the structure: each column's type and its first few values
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 57750 obs. of 22 variables:
## $ X1 : num 1 2 3 4 5 6 7 8 9 10 ...
## $ raw_row_number : chr "923578" "923657" "912091" "923680" ...
## $ date : Date, format: "2016-01-01" "2016-01-01" ...
## $ time : 'hms' num 18:00:00 18:08:00 18:11:00 18:13:00 ...
## ..- attr(*, "units")= chr "secs"
## $ location : chr "N WEST ST, KS, 67205" "8000 W 13TH ST N, WICHITA, KS, 67212" "500 S LIMUEL ST, WICHITA, KS, 67235" "7600 W 21ST ST N, WICHITA, KS, 67205" ...
## $ lat : num 37.7 37.7 37.7 37.7 37.7 ...
## $ lng : num -97.4 -97.4 -97.5 -97.4 -97.4 ...
## $ subject_age : num 16 44 20 21 28 27 15 20 23 NA ...
## $ subject_race : chr "white" "white" "white" "hispanic" ...
## $ subject_sex : chr "female" "male" "male" "female" ...
## $ type : chr "vehicular" "vehicular" "vehicular" "vehicular" ...
## $ disposition : chr "DISMISSED" "GUILTY (IVR)" "DISMISSED WITH PREJUDICE; DISMISSED WITH PREJUDICE" "GUILTY" ...
## $ violation : chr "RUN STOP SIGN" "SPEED OVER LIMIT" "DUI; INATTENTIVE DRIVING" "SPEED OVER LIMIT" ...
## $ citation_issued : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ outcome : chr "citation" "citation" "citation" "citation" ...
## $ posted_speed : num NA 40 NA 40 40 40 NA NA NA NA ...
## $ vehicle_color : chr "BURGUNDY OR MAROON" "\"ALUMINUM, SILVER\"" "WHITE" "\"ALUMINUM, SILVER\"" ...
## $ vehicle_make : chr "JEEP (1989 TO PRESENT)" "HYUNDAI" "HONDA" "TOYOTA" ...
## $ vehicle_model : chr NA "TUCSON" NA NA ...
## $ vehicle_year : num 2008 NA NA NA NA ...
## $ raw_defendant_race : chr "W" "W" "W" "W" ...
## $ raw_defendant_ethnicity: chr "N" "N" "N" "H" ...
## - attr(*, "spec")=
## .. cols(
## .. X1 = col_double(),
## .. raw_row_number = col_character(),
## .. date = col_date(format = ""),
## .. time = col_time(format = ""),
## .. location = col_character(),
## .. lat = col_double(),
## .. lng = col_double(),
## .. subject_age = col_double(),
## .. subject_race = col_character(),
## .. subject_sex = col_character(),
## .. type = col_character(),
## .. disposition = col_character(),
## .. violation = col_character(),
## .. citation_issued = col_logical(),
## .. outcome = col_character(),
## .. posted_speed = col_double(),
## .. vehicle_color = col_character(),
## .. vehicle_make = col_character(),
## .. vehicle_model = col_character(),
## .. vehicle_year = col_double(),
## .. raw_defendant_race = col_character(),
## .. raw_defendant_ethnicity = col_character()
## .. )
wichita %>% glimpse() # compact overview: one line per column with its type and first values
## Observations: 57,750
## Variables: 22
## $ X1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...
## $ raw_row_number <chr> "923578", "923657", "912091", "923680"...
## $ date <date> 2016-01-01, 2016-01-01, 2016-01-01, 2...
## $ time <time> 18:00:00, 18:08:00, 18:11:00, 18:13:0...
## $ location <chr> "N WEST ST, KS, 67205", "8000 W 13TH S...
## $ lat <dbl> 37.74143, 37.70880, 37.67482, 37.72402...
## $ lng <dbl> -97.38976, -97.44059, -97.48999, -97.4...
## $ subject_age <dbl> 16, 44, 20, 21, 28, 27, 15, 20, 23, NA...
## $ subject_race <chr> "white", "white", "white", "hispanic",...
## $ subject_sex <chr> "female", "male", "male", "female", "m...
## $ type <chr> "vehicular", "vehicular", "vehicular",...
## $ disposition <chr> "DISMISSED", "GUILTY (IVR)", "DISMISSE...
## $ violation <chr> "RUN STOP SIGN", "SPEED OVER LIMIT", "...
## $ citation_issued <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR...
## $ outcome <chr> "citation", "citation", "citation", "c...
## $ posted_speed <dbl> NA, 40, NA, 40, 40, 40, NA, NA, NA, NA...
## $ vehicle_color <chr> "BURGUNDY OR MAROON", "\"ALUMINUM, SIL...
## $ vehicle_make <chr> "JEEP (1989 TO PRESENT)", "HYUNDAI", "...
## $ vehicle_model <chr> NA, "TUCSON", NA, NA, "SILVERADO", "NE...
## $ vehicle_year <dbl> 2008, NA, NA, NA, NA, NA, NA, 2008, 20...
## $ raw_defendant_race <chr> "W", "W", "W", "W", "W", "W", "W", "W"...
## $ raw_defendant_ethnicity <chr> "N", "N", "N", "H", "H", "N", "H", "H"...
names(wichita) # list the column headers
## [1] "X1" "raw_row_number"
## [3] "date" "time"
## [5] "location" "lat"
## [7] "lng" "subject_age"
## [9] "subject_race" "subject_sex"
## [11] "type" "disposition"
## [13] "violation" "citation_issued"
## [15] "outcome" "posted_speed"
## [17] "vehicle_color" "vehicle_make"
## [19] "vehicle_model" "vehicle_year"
## [21] "raw_defendant_race" "raw_defendant_ethnicity"
After viewing the dataset, you can analyze it to see the min, max, mean, median and other values for each variable. These are called descriptive statistics.
wichita %>% summary() # descriptive statistics for every column
## X1 raw_row_number date time
## Min. : 1 Length:57750 Min. :2016-01-01 Length:57750
## 1st Qu.:14438 Class :character 1st Qu.:2016-03-16 Class1:hms
## Median :28876 Mode :character Median :2016-05-29 Class2:difftime
## Mean :28876 Mean :2016-06-10 Mode :numeric
## 3rd Qu.:43313 3rd Qu.:2016-08-31
## Max. :57750 Max. :2016-12-31
##
## location lat lng subject_age
## Length:57750 Min. :37.47 Min. :-101.36 Min. :11.00
## Class :character 1st Qu.:37.67 1st Qu.: -97.37 1st Qu.:24.00
## Mode :character Median :37.69 Median : -97.34 Median :33.00
## Mean :37.69 Mean : -97.33 Mean :36.71
## 3rd Qu.:37.70 3rd Qu.: -97.28 3rd Qu.:48.00
## Max. :38.48 Max. : -96.75 Max. :99.00
## NA's :1167 NA's :1167 NA's :10128
## subject_race subject_sex type
## Length:57750 Length:57750 Length:57750
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## disposition violation citation_issued outcome
## Length:57750 Length:57750 Mode:logical Length:57750
## Class :character Class :character TRUE:57750 Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## posted_speed vehicle_color vehicle_make vehicle_model
## Min. : 20.00 Length:57750 Length:57750 Length:57750
## 1st Qu.: 30.00 Class :character Class :character Class :character
## Median : 40.00 Mode :character Mode :character Mode :character
## Mean : 39.93
## 3rd Qu.: 40.00
## Max. :304.00
## NA's :35149
## vehicle_year raw_defendant_race raw_defendant_ethnicity
## Min. :1962 Length:57750 Length:57750
## 1st Qu.:2001 Class :character Class :character
## Median :2005 Mode :character Mode :character
## Mean :2005
## 3rd Qu.:2009
## Max. :2999
## NA's :43236
There are a few dplyr verbs that you need to memorize (see more at https://learn.r-journalism.com/en/wrangling/dplyr/dplyr/). First, the select verb grabs one or more columns from a dataset.
race <- wichita %>% select(subject_race) # keep only the subject_race column
Second, the group_by verb sorts the rows into groups that share the same value. The summarize verb is typically used together with group_by; here it counts the number of stops in each group and computes each group's share of all stops.
# Count the stops recorded for each race (`value`) and each race's share of
# all stops (`prop`). Inside summarize(), `.` is the grouped table piped in,
# so nrow(.) is the total number of stops.
race <- race %>%
  group_by(subject_race) %>%
  summarize(value = n(), prop = value / nrow(.))
View(race) # to view the `race` table

# Attach the population figures by race, then divide stops by residents to
# get a per-resident stop rate.
stops <- race %>%
  left_join(population, by = "subject_race") %>%
  mutate(stop_rate = value / num_people)
##Data Visualization We will need certain packages to be installed and called before creating our charts.
###Bar Chart
# Make the plot: one horizontal bar per race showing its stop rate, with the
# bars ordered by stop rate.
bar <- ggplot(
  stops,
  aes(x = reorder(subject_race, stop_rate), y = stop_rate)
) +
  # geom_col() uses the y values as bar heights directly; it is the idiomatic
  # form of geom_bar(stat = "identity").
  geom_col(position = "identity", fill = "yellow") +
  geom_hline(yintercept = 0) +
  labs(
    title = "Stopped Drivers by Race",
    subtitle = "African American drivers got stopped the most in the city of Wichita Kansas",
    # Explicit axis titles: without them the category axis is titled with the
    # raw expression "reorder(subject_race, stop_rate)".
    x = "Driver race",
    y = "Stops per resident"
  ) +
  coord_flip() # flip the axes so the race labels sit on the vertical axis

# Session-wide option: strongly prefer fixed over scientific notation when
# numbers are printed, including the axis labels drawn when `bar` is rendered.
options(scipen = 10000)
bar
###Interactive Map with leaflet
#install.packages("httpuv")
#install.packages("leaflet")
library(httpuv)
library(leaflet)
# A first, minimal map: default base tiles, zoomed in on a single point, with
# one marker whose popup reads "Wichita, KS".
m <- leaflet()
m <- addTiles(m)
m <- setView(m, lng = -97.317163, lat = 37.685327, zoom = 16)
m <- addMarkers(m, lng = -97.317163, lat = 37.685327, popup = "Wichita, KS")
m
# Palette function that maps a subject_race value to a colour. With
# ordered = TRUE the domain is meant to be taken in the order written, so the
# colours pair up positionally: white -> coral1, black -> black,
# asian/pacific islander -> yellow, hispanic -> darkolivegreen,
# other/unknown -> darkgrey.
# NOTE(review): this rebinds `race`, which earlier in the script held the
# per-race summary table; after this line `race` is a function. The name is
# kept so that any later code calling race(...) keeps working.
race <- colorFactor(
  c("coral1", "black", "yellow", "darkolivegreen", "darkgrey"),
  domain = c("white", "black", "asian/pacific islander", "hispanic", "other/unknown"),
  ordered = TRUE
)

# One small filled circle per stop, coloured by race. The popup is written as
# a formula so that, like lng/lat/color, it is evaluated against the data
# bound to the map instead of reaching into the global `wichita` object.
m2 <- leaflet(wichita) %>%
  addProviderTiles(providers$OpenStreetMap) %>%
  setView(lng = -97.31716337, lat = 37.685327, zoom = 11) %>%
  addCircleMarkers(
    ~lng, ~lat,
    popup = ~paste("This is a", subject_race, "and", subject_sex, "driver."),
    weight = 1,
    radius = 2,
    color = ~race(subject_race),
    stroke = FALSE, # spelled out: `F` is an ordinary variable that can be reassigned
    fillOpacity = 1
  )
## Warning in validateCoords(lng, lat, funcName): Data contains 1167 rows with
## either missing or invalid lat/lon values and will be ignored
m2